A Netflix Analysis

Installing pacman

install.packages("pacman")
Error in install.packages : Updating loaded packages
library(pacman)

Loading the packages needed

pacman::p_load(dplyr,tidyverse,tidyr,  janitor, lubridate, ggplot2, leaflet, plotly, readxl)

Loading our dataset

netflix= read.csv("netflix_titles.csv")

Preview of our dataset

head(netflix, 5)

Let's check the column names

names(netflix)
 [1] "show_id"      "type"         "title"        "director"     "cast"         "country"      "date_added"  
 [8] "release_year" "rating"       "duration"     "listed_in"    "description"  "X"            "X.1"         
[15] "X.2"          "X.3"          "X.4"          "X.5"          "X.6"          "X.7"          "X.8"         
[22] "X.9"          "X.10"         "X.11"         "X.12"         "X.13"        

Let's look at each column and its data type

str(netflix)
'data.frame':   8809 obs. of  26 variables:
 $ show_id     : chr  "s1" "s2" "s3" "s4" ...
 $ type        : chr  "Movie" "TV Show" "TV Show" "TV Show" ...
 $ title       : chr  "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
 $ director    : chr  "Kirsten Johnson" "" "Julien Leclercq" "" ...
 $ cast        : chr  "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
 $ country     : chr  "United States" "South Africa" "" "" ...
 $ date_added  : chr  "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
 $ release_year: int  2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
 $ rating      : chr  "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
 $ duration    : chr  "90 min" "2 Seasons" "1 Season" "1 Season" ...
 $ listed_in   : chr  "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
 $ description : chr  "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
 $ X           : logi  NA NA NA NA NA NA ...
 $ X.1         : logi  NA NA NA NA NA NA ...
 $ X.2         : logi  NA NA NA NA NA NA ...
 $ X.3         : logi  NA NA NA NA NA NA ...
 $ X.4         : logi  NA NA NA NA NA NA ...
 $ X.5         : logi  NA NA NA NA NA NA ...
 $ X.6         : logi  NA NA NA NA NA NA ...
 $ X.7         : logi  NA NA NA NA NA NA ...
 $ X.8         : logi  NA NA NA NA NA NA ...
 $ X.9         : logi  NA NA NA NA NA NA ...
 $ X.10        : logi  NA NA NA NA NA NA ...
 $ X.11        : logi  NA NA NA NA NA NA ...
 $ X.12        : logi  NA NA NA NA NA NA ...
 $ X.13        : logi  NA NA NA NA NA NA ...

Let's convert the type column to a factor

netflix$type= as.factor(netflix$type)

Investigating Null values

NAs=colSums(is.na(netflix))
names(netflix)[NAs>0]
 [1] "X"    "X.1"  "X.2"  "X.3"  "X.4"  "X.5"  "X.6"  "X.7"  "X.8"  "X.9"  "X.10" "X.11" "X.12" "X.13"

Dimensions of the dataset

dim(netflix)
[1] 8809   26

Total rows with NAs in each column

colSums(is.na(netflix))
     show_id         type        title     director         cast      country   date_added release_year 
           0            0            0            0            0            0            0            0 
      rating     duration    listed_in  description            X          X.1          X.2          X.3 
           0            0            0            0         8809         8809         8809         8809 
         X.4          X.5          X.6          X.7          X.8          X.9         X.10         X.11 
        8809         8809         8809         8809         8809         8809         8809         8809 
        X.12         X.13 
        8809         8809 

Since the columns with NAs are entirely empty and add nothing to the dataset, we can remove them

netflix= netflix %>% 
  select(show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description)

Descriptive Statistics:

Summarize the distribution of the types of shows (Movies vs. TV Shows).

#Distribution of movie type
netflix_type= netflix %>% 
  select(type) %>% 
  group_by(type) %>% 
  summarise(totalcount= n())

# Calculate percentage labels
percentages <- round(netflix_type$totalcount / sum(netflix_type$totalcount) * 100, 1)
labels <- paste(netflix_type$type, percentages, "%", sep = " ")

# Create the pie chart using plotly
fig <- plot_ly(netflix_type, labels = ~type, values = ~totalcount, type = 'pie', 
               textinfo = 'label+percent',
               insidetextorientation = 'radial',
               marker = list(colors = c('red', 'green')))

# Customize the layout
fig <- fig %>% layout(title = 'Distribution of TV Shows and Movies')

# Display the plot
fig

Calculate the number of shows released per year.

netflix_releaseyear= netflix %>% 
  select(release_year, type) %>% 
  group_by(release_year, type) %>% 
  summarise(total_moveis_or_shows= n(), .groups = "drop")

netflix_release_year= netflix %>% 
  select(release_year, type) %>% 
  group_by(release_year) %>% 
  summarise(total_shows= n()) %>% 
  arrange(desc(total_shows)) %>% 
  head(10)

fig4=plot_ly(netflix_release_year, x= ~release_year, y= ~total_shows,type= 'bar',
             text= ~total_shows,
             textposition = "auto",
             marker=list(color="green")) %>% 
  layout(title= "Top 10 years with the highest number of shows produced",
         xaxis= list(title="Years"),
         yaxis= list(title="Total Shows"))
fig4

Analyze the distribution of ratings (e.g., TV-MA, PG-13, etc.).

netflix_ratings= netflix %>%
  select(rating) %>% 
  group_by(rating) %>% 
  summarize(total_rating=n())

#the plot
fig3 =plot_ly(netflix_ratings, x= ~rating, y= ~total_rating, type='bar', marker=list(color= "red"))%>% 
  layout(title="Distribution of the ratings",
         xaxis= list(title="Ratings"),
         yaxis= list(title="Total ratings"))

fig3

Trend Analysis:

Analyze the popularity of different genres over the years.

netflix_genre= netflix %>% 
  select(release_year, listed_in) %>% 
  group_by(release_year, listed_in) %>% 
  summarise(total_shows= n(), .groups = "drop")

Genre Analysis:

Determine the most common genres listed.

netflix_genre_common= netflix %>% 
  select(listed_in) %>% 
  group_by(listed_in) %>% 
  summarise(total_count= n()) %>% 
  arrange(desc(total_count)) %>% 
  head(10)

fig5 = plot_ly(netflix_genre_common, y= ~listed_in, x= ~total_count, type= 'bar',
               text= ~total_count,
               textposition= "auto",
               marker= list(color="orange")) %>% 
  layout(
    title= "Top 10 Most common genres Listed",
    yaxis = list(title="Genres"),
    xaxis = list(title="Total Count")
  )

fig5

Analyze the correlation between genres and ratings.

netflix_corr_genre= netflix %>% 
  select(listed_in, rating) %>% 
  group_by(listed_in, rating) %>% 
  summarise(total_count = n(), .groups = "drop")
 print(netflix_corr_genre)

Country Analysis:

Analyze the Countries With the most movies produced

# Keep only the first country listed for each title. A trailing comma is
# appended first so that every value splits into at least two pieces;
# `extra = "drop"` discards any further countries without emitting a warning
# for each multi-country row.
netflix_country <- netflix %>%
  select(country) %>%
  mutate(country = paste0(country, ",")) %>%
  separate(
    col = country,
    into = c("Country", "Rest"),
    sep = ",",
    extra = "drop"
  )

# Number of titles per first-listed country, largest first.
netflix_country_grouped <- netflix_country %>%
  group_by(Country) %>%
  summarise(Total_Movies = n()) %>%
  arrange(desc(Total_Movies))

# Titles with no country carry an empty string; convert it to NA and drop that
# row so "no country" cannot appear in the ranking.
netflix_country_grouped_clean <- netflix_country_grouped %>%
  mutate(Country = na_if(Country, "")) %>%
  na.omit()

# Take the top 10 from the cleaned table; the uncleaned table still contains
# the empty-country row.
netflix_country_grouped_plot <- head(netflix_country_grouped_clean, 10)

# Bar chart of the ten countries. The bar colour is set through the trace
# attribute `marker` (singular).
fig8 <- plot_ly(
  netflix_country_grouped_plot,
  x = ~Country,
  y = ~Total_Movies,
  type = "bar",
  text = ~Total_Movies,
  marker = list(color = "yellow")
) %>%
  layout(
    title = "Top 10 Countries with the Highest No of movies Produced",
    xaxis = list(title = "Countries", tickangle = -45),
    yaxis = list(title = "Total Movies Produced")
  )
fig8
NA

Map

# # Get world map data for country coordinates
# world_map <- map_data("world")
# 
# # Prepare the data by merging with coordinates
# country_coords <- world_map %>%
#   group_by(region) %>%
#   summarize(
#     lat = mean(lat),
#     lng = mean(long)
#   ) %>%
#   rename(Country = region)
# 
# # Merge country data with coordinates
# map_data <- netflix_country_grouped_clean %>%
#   left_join(country_coords, by = "Country")
# 
# # Filter out rows with missing or invalid coordinates
# map_data_filtered <- map_data %>%
#   filter(!is.na(lat) & !is.na(lng))
# 
# # Create an interactive map with markers
# m <- leaflet(map_data_filtered) %>%
#   addTiles() %>%
#   addMarkers(
#     clusterOptions = markerClusterOptions(),
#     ~lng, ~lat,
#     popup = ~paste("<strong>Country:</strong>", Country, "<br>",
#                    "<strong>Value:</strong>", Total_Movies)
#   )
# 
# # Display the map
# m

Analyze the diversity of content by country.

United States

netflix_diversity= netflix %>% 
  select(country, listed_in) %>% 
  group_by(country, listed_in) %>% 
  summarise(total_count=n(), .groups = "drop") %>% 
  arrange(desc(total_count))
print(netflix_diversity)

Duration Analysis:

Compare the average duration of movies vs. TV shows.

netflix$duration= as.character(netflix$duration)

netflix_duration <- netflix %>%
  select(type, duration) %>%
  separate(col = duration, into = c("duration", "units"), sep = " ")

# Convert duration back to integer
netflix_duration$duration <- as.integer(netflix_duration$duration)

netflix_duration_compison_type= netflix_duration %>% 
  group_by(type) %>%
  summarise(Average_Duration= floor(mean(duration, na.rm = TRUE)))

print(netflix_duration_compison_type)

Analyze the distribution of the number of seasons for TV shows.

netflix_tvshows_distribution= netflix %>% 
  select(type, duration) %>% 
  filter(type == "TV Show") %>% 
  group_by(duration) %>% 
  summarise(Frequency_totals= n()) %>% 
  arrange(desc(Frequency_totals))

netflix_tvshows_distribution$duration= factor(netflix_tvshows_distribution$duration, levels= unique(netflix_tvshows_distribution$duration))
# Create a bar chart
fig6 = plot_ly(netflix_tvshows_distribution, x = ~duration, y= ~Frequency_totals, type = 'bar',
            marker=list(color="green")) %>%
  layout(
    title= "Distribution of Seasons in the TV Shows",
    xaxis= list(title ="Duration", tickangle=-45),
    yaxis= list(title= "Frequency")
  )

# Display the plot
fig6
---
title: "R Notebook"
output: html_notebook
---

# A Netflix Analysis
#### Installing pacman
```{r, warning = FALSE}
# Install pacman only when it is missing. An unconditional install.packages()
# re-downloads the package on every run and can be refused when pacman is
# already loaded in the session.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
```
#### Loading the packages needed
```{r}
pacman::p_load(dplyr,tidyverse,tidyr,  janitor, lubridate, ggplot2, leaflet, plotly, readxl)
```
#### Loading our dataset
```{r}
# Read the Netflix catalogue from the CSV file in the working directory.
netflix <- read.csv(file = "netflix_titles.csv")
```
#### Preview of our dataset
```{r}
head(netflix, 5)
```
#### Let's check the column names
```{r}
names(netflix)
```
#### Let's look at each column and its data type
```{r}
str(netflix)
```
#### Let's convert the type column to a factor
```{r}
# Store the title type as a factor instead of plain text.
netflix[["type"]] <- as.factor(netflix[["type"]])
```

#### Investigating Null values
```{r}
# Count the missing values in every column, then list the columns that have
# at least one.
NAs <- colSums(is.na(netflix))
names(NAs)[NAs > 0]
```
#### Dimensions of the dataset
```{r}
dim(netflix)
```
#### Total rows with NAs in each column
```{r}
colSums(is.na(netflix))
```

#### Since the columns with NAs are entirely empty and add nothing to the dataset, we can remove them
```{r}
# Restrict the data to the twelve named catalogue columns; every other column
# read from the CSV is discarded.
netflix <- select(
  netflix,
  show_id, type, title, director, cast, country,
  date_added, release_year, rating, duration, listed_in, description
)
```

## Descriptive Statistics:
### Summarize the distribution of the types of shows (Movies vs. TV Shows).
```{r}
# Number of titles for each type.
netflix_type <- netflix %>%
  group_by(type) %>%
  summarise(totalcount = n())

# Share of each type in percent (one decimal) and a text label per type.
# These two objects are not passed to plot_ly() below; the chart asks plotly
# for its own percentages through `textinfo`.
percentages <- round(100 * prop.table(netflix_type$totalcount), 1)
labels <- paste(netflix_type$type, percentages, "%")

# Pie chart of the type split, with the title added in the same pipeline.
fig <- plot_ly(
  netflix_type,
  labels = ~type,
  values = ~totalcount,
  type = "pie",
  textinfo = "label+percent",
  insidetextorientation = "radial",
  marker = list(colors = c("red", "green"))
) %>%
  layout(title = "Distribution of TV Shows and Movies")

# Show the chart.
fig
```
### Calculate the number of shows released per year.
```{r}
# Titles per release year, split by type (not used by the chart below).
netflix_releaseyear <- netflix %>%
  group_by(release_year, type) %>%
  summarise(total_moveis_or_shows = n(), .groups = "drop")

# The ten release years with the most titles overall, largest first.
netflix_release_year <- netflix %>%
  group_by(release_year) %>%
  summarise(total_shows = n()) %>%
  arrange(desc(total_shows)) %>%
  head(n = 10)

# Bar chart of those ten years, each bar labelled with its count.
fig4 <- plot_ly(
  netflix_release_year,
  x = ~release_year,
  y = ~total_shows,
  type = "bar",
  text = ~total_shows,
  textposition = "auto",
  marker = list(color = "green")
)
fig4 <- layout(
  fig4,
  title = "Top 10 years with the highest number of shows produced",
  xaxis = list(title = "Years"),
  yaxis = list(title = "Total Shows")
)
fig4
```

### Analyze the distribution of ratings (e.g., TV-MA, PG-13, etc.).
```{r}
# Number of titles for each value of the rating column.
netflix_ratings <- netflix %>%
  group_by(rating) %>%
  summarize(total_rating = n())

# Bar chart with one bar per rating value.
fig3 <- plot_ly(
  netflix_ratings,
  x = ~rating,
  y = ~total_rating,
  type = "bar",
  marker = list(color = "red")
)
fig3 <- layout(
  fig3,
  title = "Distribution of the ratings",
  xaxis = list(title = "Ratings"),
  yaxis = list(title = "Total ratings")
)

fig3
```
## Trend Analysis:
### Explore the trends in the number of shows added to the platform over time.
```{r}
# Number of titles for each release year.
# NOTE(review): this buckets titles by `release_year`, while the section
# heading talks about titles *added* to the platform; the `date_added` column
# would be the source for that -- confirm which one is intended.
netflix_release_year_group <- netflix %>%
  group_by(release_year) %>%
  summarise(total_shows = n())

# Assign each year to a decade bucket and total the titles per bucket.
# case_when() uses the first matching condition, so each `<` test only catches
# years not already claimed by an earlier line.
netflix_release_year_grouped <- netflix_release_year_group %>%
  mutate(release_year_grouped = case_when(
    release_year < 1930 ~ "1920-1930",
    release_year < 1940 ~ "1930-1940",
    release_year < 1950 ~ "1940-1950",
    release_year < 1960 ~ "1950-1960",
    release_year < 1970 ~ "1960-1970", # was mislabelled "1950-1970"
    release_year < 1980 ~ "1970-1980",
    release_year < 1990 ~ "1980-1990",
    release_year < 2000 ~ "1990-2000",
    release_year < 2010 ~ "2000-2010",
    release_year < 2020 ~ "2010-2020",
    release_year >= 2020 ~ "2020"
  )) %>%
  group_by(release_year_grouped) %>%
  summarise(total_shows = sum(total_shows))

# Line chart of titles per decade bucket.
fig1 <- plot_ly(
  netflix_release_year_grouped,
  x = ~release_year_grouped,
  y = ~total_shows,
  type = "scatter",
  mode = "lines"
) %>%
  layout(
    title = "Netflix Shows by Release Year",
    xaxis = list(title = "Release Year", tickangle = -45),
    yaxis = list(title = "Total Shows")
  )

# Show the chart.
fig1
```

### Analyze the popularity of different genres over the years.
```{r}
# Titles per release year and `listed_in` value. `listed_in` holds the full
# comma-separated genre string of a title, so each distinct combination of
# genres is counted as its own group.
netflix_genre <- netflix %>%
  group_by(release_year, listed_in) %>%
  summarise(total_shows = n(), .groups = "drop")
```
## Genre Analysis:
### Determine the most common genres listed.
```{r}
# The ten most frequent `listed_in` values (full genre strings), largest
# first.
netflix_genre_common <- netflix %>%
  group_by(listed_in) %>%
  summarise(total_count = n()) %>%
  arrange(desc(total_count)) %>%
  head(n = 10)

# Horizontal-style bar chart: genre strings on the y axis, counts on the x
# axis, each bar labelled with its count.
fig5 <- plot_ly(
  netflix_genre_common,
  y = ~listed_in,
  x = ~total_count,
  type = "bar",
  text = ~total_count,
  textposition = "auto",
  marker = list(color = "orange")
)
fig5 <- layout(
  fig5,
  title = "Top 10 Most common genres Listed",
  yaxis = list(title = "Genres"),
  xaxis = list(title = "Total Count")
)

fig5
```

### Analyze the correlation between genres and ratings.
```{r}
# Cross-tabulate genre strings against ratings: one row per
# (`listed_in`, `rating`) pair with the number of titles in that pair.
netflix_corr_genre <- netflix %>%
  group_by(listed_in, rating) %>%
  summarise(total_count = n(), .groups = "drop")
print(netflix_corr_genre)
```

## Country Analysis:
### Analyze the Countries With the most movies produced
```{r, warning=FALSE}
# Keep only the first country listed for each title. A trailing comma is
# appended first so that every value splits into at least two pieces;
# `extra = "drop"` discards any further countries without emitting a warning
# for each multi-country row.
netflix_country <- netflix %>%
  select(country) %>%
  mutate(country = paste0(country, ",")) %>%
  separate(
    col = country,
    into = c("Country", "Rest"),
    sep = ",",
    extra = "drop"
  )

# Number of titles per first-listed country, largest first.
netflix_country_grouped <- netflix_country %>%
  group_by(Country) %>%
  summarise(Total_Movies = n()) %>%
  arrange(desc(Total_Movies))

# Titles with no country carry an empty string; convert it to NA and drop that
# row so "no country" cannot appear in the ranking.
netflix_country_grouped_clean <- netflix_country_grouped %>%
  mutate(Country = na_if(Country, "")) %>%
  na.omit()

# Take the top 10 from the cleaned table; the uncleaned table still contains
# the empty-country row.
netflix_country_grouped_plot <- head(netflix_country_grouped_clean, 10)

# Bar chart of the ten countries. The bar colour is set through the trace
# attribute `marker` (singular).
fig8 <- plot_ly(
  netflix_country_grouped_plot,
  x = ~Country,
  y = ~Total_Movies,
  type = "bar",
  text = ~Total_Movies,
  marker = list(color = "yellow")
) %>%
  layout(
    title = "Top 10 Countries with the Highest No of movies Produced",
    xaxis = list(title = "Countries", tickangle = -45),
    yaxis = list(title = "Total Movies Produced")
  )
fig8

```
# Map
```{r}
# # Get world map data for country coordinates
# world_map <- map_data("world")
# 
# # Prepare the data by merging with coordinates
# country_coords <- world_map %>%
#   group_by(region) %>%
#   summarize(
#     lat = mean(lat),
#     lng = mean(long)
#   ) %>%
#   rename(Country = region)
# 
# # Merge country data with coordinates
# map_data <- netflix_country_grouped_clean %>%
#   left_join(country_coords, by = "Country")
# 
# # Filter out rows with missing or invalid coordinates
# map_data_filtered <- map_data %>%
#   filter(!is.na(lat) & !is.na(lng))
# 
# # Create an interactive map with markers
# m <- leaflet(map_data_filtered) %>%
#   addTiles() %>%
#   addMarkers(
#     clusterOptions = markerClusterOptions(),
#     ~lng, ~lat,
#     popup = ~paste("<strong>Country:</strong>", Country, "<br>",
#                    "<strong>Value:</strong>", Total_Movies)
#   )
# 
# # Display the map
# m

```
### Analyze the diversity of content by country.
#### United States
```{r}
# Titles per (`country`, `listed_in`) pair, most frequent pair first. Both
# columns are used as stored, i.e. as the full comma-separated strings.
netflix_diversity <- netflix %>%
  group_by(country, listed_in) %>%
  summarise(total_count = n(), .groups = "drop") %>%
  arrange(desc(total_count))
print(netflix_diversity)
```
## Duration Analysis:
### Compare the average duration of movies vs. TV shows.
```{r, warning=FALSE}
# Make sure duration is character before splitting it.
netflix$duration <- as.character(netflix$duration)

# Split values such as "90 min" or "2 Seasons" into the leading number and its
# unit, then turn the number into an integer.
netflix_duration <- netflix %>%
  select(type, duration) %>%
  separate(col = duration, into = c("duration", "units"), sep = " ") %>%
  mutate(duration = as.integer(duration))

# Mean duration per type, rounded down and ignoring missing values. The number
# is in minutes for rows given as "min" and in seasons for rows given as
# "Season(s)", so the two averages are not in the same unit.
netflix_duration_compison_type <- netflix_duration %>%
  group_by(type) %>%
  summarise(Average_Duration = floor(mean(duration, na.rm = TRUE)))

print(netflix_duration_compison_type)
```
### Analyze the distribution of the number of seasons for TV shows.
```{r}
# Frequency of each duration value among TV shows, most frequent first. The
# duration is then stored as a factor whose levels follow that same order
# (presumably so the bars are drawn by frequency rather than in plotly's
# default category order).
netflix_tvshows_distribution <- netflix %>%
  filter(type == "TV Show") %>%
  group_by(duration) %>%
  summarise(Frequency_totals = n()) %>%
  arrange(desc(Frequency_totals)) %>%
  mutate(duration = factor(duration, levels = unique(duration)))

# Bar chart with one bar per duration value.
fig6 <- plot_ly(
  netflix_tvshows_distribution,
  x = ~duration,
  y = ~Frequency_totals,
  type = "bar",
  marker = list(color = "green")
)
fig6 <- layout(
  fig6,
  title = "Distribution of Seasons in the TV Shows",
  xaxis = list(title = "Duration", tickangle = -45),
  yaxis = list(title = "Frequency")
)

# Show the chart.
fig6
```
